In [40]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import numpy as np
In [43]:
kobe = pd.read_csv('data.csv', sep=',')
kobe= kobe[np.isfinite(kobe['shot_made_flag'])]
del kobe['lat']
del kobe['lon']
del kobe['game_id']
del kobe['team_id']
del kobe['team_name']
kobe_2 = pd.read_csv('data.csv', sep=',')
kobe_2= kobe_2[np.isfinite(kobe_2['shot_made_flag'])]
del kobe_2['lat']
del kobe_2['lon']
del kobe_2['game_id']
del kobe_2['team_id']
del kobe_2['team_name']
In [26]:
mt_up = preprocessing.LabelEncoder()
kobe.matchup = mt_up.fit_transform(kobe.matchup )
#kobe_2.matchup = mt_up.fit_transform(kobe.matchup )
opp = preprocessing.LabelEncoder()
kobe.opponent = opp.fit_transform(kobe.opponent )
#kobe_2.opponent = opp.fit_transform(kobe.opponent )
dt = preprocessing.LabelEncoder()
kobe.game_date = dt.fit_transform(kobe.game_date )
#kobe_2.game_date = dt.fit_transform(kobe.game_date )
at = preprocessing.LabelEncoder()
kobe.action_type = at.fit_transform(kobe.action_type )
#kobe_2.action_type = at.fit_transform(kobe.action_type )
cst = preprocessing.LabelEncoder()
kobe.combined_shot_type = cst.fit_transform(kobe.combined_shot_type )
#kobe_2.combined_shot_type = cst.fit_transform(kobe.combined_shot_type )
seson = preprocessing.LabelEncoder()
kobe.season = seson.fit_transform(kobe.season )
#kobe_2.season = seson.fit_transform(kobe.season )
st = preprocessing.LabelEncoder()
kobe.shot_type = st.fit_transform(kobe.shot_type )
#kobe_2.shot_type = st.fit_transform(kobe.shot_type )
sza = preprocessing.LabelEncoder()
kobe.shot_zone_area = sza.fit_transform(kobe.shot_zone_area )
#kobe_2.shot_zone_area = sza.fit_transform(kobe.shot_zone_area )
szb = preprocessing.LabelEncoder()
kobe.shot_zone_basic = szb.fit_transform(kobe.shot_zone_basic )
#kobe_2.shot_zone_basic = szb.fit_transform(kobe.shot_zone_basic )
szr = preprocessing.LabelEncoder()
kobe.shot_zone_range = szr.fit_transform(kobe.shot_zone_range )
#kobe_2.shot_zone_range = szr.fit_transform(kobe.shot_zone_range )
Out[26]:
In [44]:
from sklearn.cross_validation import train_test_split
# Generate the training set. Set random_state to be able to replicate results.
train = kobe.sample(frac=0.6, random_state=1)
train_2 = kobe_2.sample(frac=0.6, random_state=1)
# Select anything not in the training set and put it in the testing set.
test = kobe.loc[~kobe.index.isin(train.index)]
test_2 = kobe_2.loc[~kobe_2.index.isin(train_2.index)]
In [45]:
columns = kobe.columns.tolist()
columns = [c for c in columns if c not in ["shot_made_flag","team_id","team_name"]]
kobe_train_x =train[columns]
kobe_test_x =test[columns]
kobe_train_y=train['shot_made_flag']
kobe_test_y=test['shot_made_flag']
print(kobe_train_x.shape)
print(kobe_test_x.shape)
print(kobe_train_y.shape)
print(kobe_test_y.shape)
In [77]:
def optimization(depth, n_est,l_r):
maxacc=0
best_depth=0
best_n_est=0
best_l_r=0
for i in range(1,depth):
for j in n_est:
for k in l_r:
gbm = xgb.XGBClassifier(max_depth=i, n_estimators=j, learning_rate=k).fit(kobe_train_x, kobe_train_y)
predicted = gbm.predict(kobe_test_x)
key=str(i)+"_"+str(j)+"_"+str(k)
accu=accuracy_score(kobe_test_y, predicted)
if(accu>maxacc):
maxacc=accu
best_depth=i
best_n_est=j
best_l_r=k
print(maxkey+" "+str(maxacc))
return(best_depth,best_n_est,best_l_r)
n_est=[5,10,20,50,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
depth=10
l_r = [0.0001, 0.001, 0.01,0.05, 0.1, 0.2, 0.3]
best_depth,best_n_est,best_l_r=optimization(depth,n_est,l_r)
In [5]:
#hard coded the best features
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=600, learning_rate=0.01).fit(kobe_train_x, kobe_train_y)
predicted = gbm.predict(kobe_test_x)
# summarize the fit of the model
print(metrics.classification_report(kobe_test_y, predicted))
print("Confusion Matrix")
print(metrics.confusion_matrix(kobe_test_y, predicted))
accuracy=accuracy_score(kobe_test_y, predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
In [47]:
test_2['predicted']=predicted
test_2.to_csv(path_or_buf='test_with_predictions.csv', sep=',')
test_2.head(10)
Out[47]: